In [1]:
import sys
from ete3 import Tree, TreeStyle, NodeStyle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy
# Reads in trees
substitutionTreeFile="Concatenate5662_CAT_7000trees_consensus_rooted"
duplicationTreeFile="DuplicationsNoNumber.tree"
transferTreeFile="TransfersNoNumber.tree"
lossTreeFile="LossesNoNumber.tree"
def readTreeFromFile(file):
try:
f=open(file, 'r')
except IOError:
print ("Unknown file: "+file)
sys.exit()
line = ""
for l in f:
line += l.strip()
f.close()
t = Tree( line )
return t
substitutionTree = readTreeFromFile(substitutionTreeFile)
duplicationTree = readTreeFromFile(duplicationTreeFile)
transferTree = readTreeFromFile(transferTreeFile)
lossTree = readTreeFromFile(lossTreeFile)
In [2]:
def extractBranchLengths(t):
bls = list()
for node in t.traverse("postorder"):
bls.append(node.dist)
return bls
substitutionTreeBls = extractBranchLengths(substitutionTree)
duplicationTreeBls = extractBranchLengths(duplicationTree)
transferTreeBls = extractBranchLengths(transferTree)
lossTreeBls = extractBranchLengths(lossTree)
In [3]:
def plotAndComputeCorrelation(x,y,namex, namey, logyn, logxn):
print("Pearson correlation coefficient and p-value: "+ str(scipy.stats.pearsonr(x, y)))
#Plotting:
plt.plot(x, y, 'bo')
plt.xlabel(namex, fontsize=15)
plt.ylabel(namey, fontsize=15)
#plt.legend(['data'], loc='upper left')
if logyn:
plt.yscale('log')
if logxn:
plt.xscale('log')
First, substitutions vs duplications
In [4]:
%matplotlib inline
plotAndComputeCorrelation(substitutionTreeBls,duplicationTreeBls,'Substitutions', 'Duplications', True, True)
Second, substitutions vs transfers.
In [5]:
%matplotlib inline
plotAndComputeCorrelation(substitutionTreeBls,transferTreeBls,'Substitutions', 'Transfers', True, True)
Third, substitutions vs losses.
In [6]:
%matplotlib inline
plotAndComputeCorrelation(substitutionTreeBls,lossTreeBls,'Substitutions', 'Losses', True, True)
Fourth, Duplications vs transfers.
In [7]:
%matplotlib inline
plotAndComputeCorrelation(duplicationTreeBls,transferTreeBls,'Duplications', 'Transfers', True, True)
Fifth, transfer vs losses.
In [8]:
%matplotlib inline
plotAndComputeCorrelation(transferTreeBls,lossTreeBls,'Transfers', 'Losses', True, True)
In [9]:
# We compute all the leaf to root distances, and we compute the normalized variance across leaves.
def computeUltrametricityIndex(t):
root = t.get_tree_root()
leaves = root.get_leaves()
distances=list()
for leaf in leaves:
distances.append(root.get_distance(leaf))
meanDist = np.mean(distances)
for i in range(len(distances)):
distances[i] = distances[i] / meanDist
return np.var(distances)
print ("Deviance from ultrametricity of the substitution tree: " + str(computeUltrametricityIndex(substitutionTree)))
print ("Deviance from ultrametricity of the duplication tree: " + str(computeUltrametricityIndex(duplicationTree)))
print ("Deviance from ultrametricity of the transfer tree: " + str(computeUltrametricityIndex(transferTree)))
print ("Deviance from ultrametricity of the loss tree: " + str(computeUltrametricityIndex(lossTree)))
It appears that the transfer tree is more ultrametric than the loss tree, which is more ultrametric than the substitution tree, itself finally much more ultrametric than the duplication tree.
Finally I transformed the 4 trees above into ultrametric trees using fastdate. The duplication tree could not be transformed, there was a problem in fastdate. Eric evaluated the other trees in terms of the numbers of undated transfers they disagree with. The results were the following:
Transfer tree: 2848
Loss tree: 2992
Substitution tree: 2934
They are all on the good side of the random distribution, but within the distribution. It seems like fastdate provides dated trees that are not as good as Phylobayes's relaxed clock models. It is interesting to note that the transfer-based clocklike tree is the best, because the two transfer-based signals are not quite the same: one one hand, it's the general numbers of transfers, on the other, it's where they come from and where to go to.
In [ ]: